Disclaimer: Claude AI was used in this assignment

  • Removed categorical columns
  • Performed Linear Regression
  • Reports:
    • Data drift
    • Regression quality metrics
In [ ]:
# Silence library warnings (sklearn/evidently are chatty) so report output stays readable.
# NOTE(review): blanket 'ignore' also hides genuinely useful warnings — consider
# narrowing with the `category=` argument.
import warnings
warnings.filterwarnings('ignore')
In [49]:
# Install evidently if needed
# pip install evidently

# Print the installed version — the ColumnMapping/Report API used below is the
# legacy (pre-0.7) interface, so the version matters for reproducibility.
import evidently
print(f"Evidently version: {evidently.__version__}")
Evidently version: 0.6.4
In [50]:
import pandas as pd
import numpy as np
import requests
import io

from datetime import datetime, time 
from sklearn import datasets, ensemble

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import DatasetDriftMetric, RegressionQualityMetric

from IPython.display import display 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import LabelEncoder
In [51]:
# NOTE(review): hardcoded absolute local path — anyone else must edit this line
# to run the notebook; a configurable DATA_DIR would be more portable.
file_path = "/Users/brunamedeiros/Documents/University of Chicago/Summer 2025 - ML Ops/HW4/cancer_reg.csv"

# latin-1 encoding — presumably the CSV contains non-UTF-8 bytes; verify against the source file.
df = pd.read_csv(file_path, encoding='latin-1')
print(f"Dataset shape: {df.shape}")
print(f"Missing values:\n{df.isnull().sum()}")
Dataset shape: (3047, 34)
Missing values:
avgAnnCount                   0
avgDeathsPerYear              0
TARGET_deathRate              0
incidenceRate                 0
medIncome                     0
popEst2015                    0
povertyPercent                0
studyPerCap                   0
binnedInc                     0
MedianAge                     0
MedianAgeMale                 0
MedianAgeFemale               0
Geography                     0
AvgHouseholdSize              0
PercentMarried                0
PctNoHS18_24                  0
PctHS18_24                    0
PctSomeCol18_24            2285
PctBachDeg18_24               0
PctHS25_Over                  0
PctBachDeg25_Over             0
PctEmployed16_Over          152
PctUnemployed16_Over          0
PctPrivateCoverage            0
PctPrivateCoverageAlone     609
PctEmpPrivCoverage            0
PctPublicCoverage             0
PctPublicCoverageAlone        0
PctWhite                      0
PctBlack                      0
PctAsian                      0
PctOtherRace                  0
PctMarriedHouseholds          0
BirthRate                     0
dtype: int64
In [58]:
# ----- Data preparation -----------------------------------------------------
# Drop rows with any missing value.
# NOTE(review): PctSomeCol18_24 alone is missing in 2285/3047 rows, so this
# discards ~80% of the data (train set ends up at 472 rows). Dropping that
# column first would retain far more samples — confirm this is intended.
df.dropna(inplace=True)

df = df.drop(['Geography', 'binnedInc'], axis=1)  # Drop categorical columns

# ----- Build drifted variants -----------------------------------------------
# Each dataset adds one more artificial shift, simulating cumulative drift:
#   df_A   : medIncome shifted down by 40,000
#   df_AB  : df_A  + povertyPercent shifted up by 20
#   df_ABC : df_AB + AvgHouseholdSize shifted up by 2
df_A = df.copy()
df_A['medIncome'] -= 40000

df_AB = df_A.copy()
df_AB['povertyPercent'] += 20

df_ABC = df_AB.copy()
df_ABC['AvgHouseholdSize'] += 2

# NOTE(review): this name shadows `sklearn.datasets` imported at the top of the
# notebook. That import is unused here, but renaming one of them would be safer.
datasets = {
    "df": df,
    "df_A": df_A,
    "df_AB": df_AB,
    "df_ABC": df_ABC
}

print("You have 4 datasets:")
for name in datasets:
    print(f"- {name}")

# ----- Split features and target --------------------------------------------
print(f"\nSplitting features and target...")
X_dict = {name: data.drop(columns=['TARGET_deathRate']) for name, data in datasets.items()}
y_dict = {name: data['TARGET_deathRate'] for name, data in datasets.items()}

# ----- Train-test split ------------------------------------------------------
# Fixed random_state so every variant splits identically (same row indices).
print(f"\nTrain-test split...")
X_train_dict, y_train_dict, X_test_dict, y_test_dict = {}, {}, {}, {}
for name in datasets.keys():
    X_train, X_test, y_train, y_test = train_test_split(
        X_dict[name], y_dict[name], test_size=0.2, random_state=42)
    X_train_dict[name] = X_train
    y_train_dict[name] = y_train
    X_test_dict[name] = X_test
    y_test_dict[name] = y_test

print("  Results:")
for name in datasets.keys():
    print(f"    {name} - Train shape: {X_train_dict[name].shape}, Test shape: {X_test_dict[name].shape}")

# ----- Train model -----------------------------------------------------------
# Train only on the original (reference) data; the drifted sets are scored with
# this fixed model so quality degradation reflects the drift alone.
print(f"\nTraining model on original dataset...")
model = LinearRegression()
model.fit(X_train_dict["df"], y_train_dict["df"]) # only train on original dataset

# ----- EVIDENTLY REPORTS -----------------------------------------------------

# 1. Add target + predictions to each test dataset (Evidently needs both columns)
print(f"\nAdding predictions to test datasets (to each altered DF)...")  # typo fix: "alternated"
complete_test_dict = {}
for name in datasets.keys():
    complete_test_dict[name] = X_test_dict[name].copy()
    complete_test_dict[name]['TARGET_deathRate'] = y_test_dict[name]  # Add target
    complete_test_dict[name]['prediction'] = model.predict(X_test_dict[name])  # Add predictions

# 2. Create column mapping so Evidently knows which columns to compare
column_mapping = ColumnMapping()
column_mapping.target = 'TARGET_deathRate'
column_mapping.prediction = 'prediction'

# 3. One report per drifted dataset, each compared against the reference test
#    set. The loop replaces three copy-pasted report blocks.
reports = {}
for suffix in ("A", "AB", "ABC"):
    report = Report(metrics=[
        DatasetDriftMetric(),      # Shows which features drifted
        RegressionQualityMetric()  # Shows how prediction quality changed
    ])
    report.run(reference_data=complete_test_dict["df"],
               current_data=complete_test_dict[f"df_{suffix}"],
               column_mapping=column_mapping)
    report.save_html(f"data_drift_{suffix}.html")
    reports[suffix] = report

# Keep the individual names that the display cells below rely on.
report_A = reports["A"]
report_AB = reports["AB"]
report_ABC = reports["ABC"]
You have 4 datasets:
- df
- df_A
- df_AB
- df_ABC

Splitting features and target...

Train-test split...
  Results:
    df - Train shape: (472, 31), Test shape: (119, 31)
    df_A - Train shape: (472, 31), Test shape: (119, 31)
    df_AB - Train shape: (472, 31), Test shape: (119, 31)
    df_ABC - Train shape: (472, 31), Test shape: (119, 31)

Training model on original dataset...

Adding predictions to test datasets (to each alternated DF)...
In [59]:
# Render the interactive drift report inline: reference df vs df_A
report_A
Out[59]:
In [60]:
# Render the interactive drift report inline: reference df vs df_AB
report_AB
Out[60]:
In [61]:
# Render the interactive drift report inline: reference df vs df_ABC
report_ABC
Out[61]:

Conclusion: Even though the data drift was not flagged as significant (the share of drifting columns stayed below the 0.5 threshold), it still had a substantial impact on model performance: MAE and MAPE worsened with every additional change to the dataset.